Artificial Neural Networks (ANN)
Table of Contents
Perceptron
XOR Problem
| $x_1$ | $x_2$ | $x_1$ XOR $x_2$ |
|---|---|---|
| 0 | 0 | 0 |
| 0 | 1 | 1 |
| 1 | 0 | 1 |
| 1 | 1 | 0 |
Neurons compute the weighted sum of their inputs
A neuron is activated or fired when the sum $a$ is positive
$$
\begin{align*}
a &= \omega_0 + \omega_1 x_1 + \omega_2 x_2 \\ \\
\hat{y} &= g(a) =
\begin{cases}
1 & a > 0\\
0 & \text{otherwise}
\end{cases}
\end{align*}
$$
Differentiable activation function
In a compact representation
Multi-layer perceptron
We can represent this “neuron” as follows:
The main weakness of linear predictors is their lack of capacity. For classification, the populations have to be linearly separable.
The XOR example can be solved by pre-processing the data to make the two populations linearly separable.
Universal function approximator; universal function classifier
Parameterized
Example: Linear Classifier
Example: Neural Networks
colah's blog
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import time
%matplotlib inline
#training data gerneration
m = 1000
x1 = 8*np.random.rand(m, 1)
x2 = 7*np.random.rand(m, 1) - 4
g = 0.8*x1 + x2 - 3
C1 = np.where(g >= 0)[0]
C0 = np.where(g < 0)[0]
N = C1.shape[0]
M = C0.shape[0]
m = N + M
X1 = np.hstack([np.ones([N,1]), x1[C1], x2[C1]])
X0 = np.hstack([np.ones([M,1]), x1[C0], x2[C0]])
train_X = np.vstack([X1, X0])
train_y = np.vstack([np.ones([N,1]), -np.ones([M,1])])
train_X = np.asmatrix(train_X)
train_y = np.asmatrix(train_y)
plt.figure(figsize=(10, 8))
plt.plot(x1[C1], x2[C1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(x1[C0], x2[C0], 'bo', alpha = 0.4, label = 'C0')
plt.legend(loc = 1, fontsize = 15)
plt.xlabel(r'$x_1$', fontsize = 15)
plt.ylabel(r'$x_2$', fontsize = 15)
plt.show()
# relabel as 1/0 so the labels match tf.nn.sigmoid_cross_entropy_with_logits below
train_y = np.vstack([np.ones([N,1]), np.zeros([M,1])])
train_y = np.asmatrix(train_y)
import tensorflow as tf
# logistic regression in TF1: sigmoid cross-entropy on linear logits
LR = 0.05        # learning rate for gradient descent
n_iter = 15000   # number of full-batch training steps
x = tf.placeholder(tf.float32, [None, 3])   # rows are [1, x1, x2]
y = tf.placeholder(tf.float32, [None, 1])   # 0/1 labels
w = tf.Variable(tf.random_normal([3,1]))    # weights; w[0] acts as the bias (x[:,0] == 1)
y_pred = tf.matmul(x,w)                     # logits (no activation here)
loss = tf.nn.sigmoid_cross_entropy_with_logits(logits = y_pred, labels = y)
loss = tf.reduce_mean(loss)
optm = tf.train.GradientDescentOptimizer(LR).minimize(loss)
init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_iter):
        sess.run(optm, feed_dict = {x: train_X, y: train_y})
    w_hat = sess.run(w)   # learned weights, fetched after training finishes
# decision boundary: w0 + w1*x1 + w2*x2 = 0  =>  x2 = -(w1/w2)*x1 - w0/w2
x1p = np.arange(0, 8, 0.01).reshape(-1, 1)
x2p = - w_hat[1,0]/w_hat[2,0]*x1p - w_hat[0,0]/w_hat[2,0]
plt.figure(figsize=(10, 8))
plt.plot(x1[C1], x2[C1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(x1[C0], x2[C0], 'bo', alpha = 0.4, label = 'C0')
plt.plot(x1p, x2p, 'g', linewidth = 3, label = '')
plt.xlim([0, 8])
plt.xlabel('$x_1$', fontsize = 15)
plt.ylabel('$x_2$', fontsize = 15)
plt.legend(loc = 1, fontsize = 12)
plt.show()
# define input and output size ([1, x1, x2] -> single logit)
n_input = 3
n_output = 1
# define weights as a dictionary (keyed by layer name, extended to more layers later)
weights = {
    'output' : tf.Variable(tf.random_normal([n_input, n_output], stddev = 0.1))
}
# define placeholders for train_x and train_y (None = any batch size)
x = tf.placeholder(tf.float32, [None, n_input])
y = tf.placeholder(tf.float32, [None, n_output])
def build_model(x, weights):
    """Linear model: return the logits x @ W (no bias, no nonlinearity)."""
    return tf.matmul(x, weights['output'])
# define loss: mean sigmoid cross-entropy over the linear logits
pred = build_model(x, weights)
loss = tf.nn.sigmoid_cross_entropy_with_logits(logits = pred, labels = y)
loss = tf.reduce_mean(loss)
LR = 0.05
optm = tf.train.GradientDescentOptimizer(LR).minimize(loss)
n_batch = 50      # Batch size (declared but unused: the loop below feeds the full set)
n_iter = 15000    # Learning iteration
n_prt = 250       # Print cycle (loss is recorded every n_prt steps)
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
# training or learning
loss_record = []
for epoch in range(n_iter):
    sess.run(optm, feed_dict = {x: train_X, y: train_y})
    if epoch % n_prt == 0:
        loss_record.append(sess.run(loss, feed_dict = {x: train_X, y: train_y}))
w_hat = sess.run(weights['output'])
# loss curve over training iterations
plt.figure(figsize=(10,8))
plt.plot(np.arange(len(loss_record))*n_prt, loss_record)
plt.xlabel('iteration', fontsize = 15)
plt.ylabel('loss', fontsize = 15)
plt.show()
# decision boundary from the learned weights (w_hat[0] is the bias, since x[:,0] == 1)
x1p = np.arange(0, 8, 0.01).reshape(-1, 1)
x2p = - w_hat[1,0]/w_hat[2,0]*x1p - w_hat[0,0]/w_hat[2,0]
plt.figure(figsize=(10, 8))
plt.plot(x1[C1], x2[C1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(x1[C0], x2[C0], 'bo', alpha = 0.4, label = 'C0')
plt.plot(x1p, x2p, 'g', linewidth = 3, label = '')
plt.xlim([0, 8])
plt.xlabel('$x_1$', fontsize = 15)
plt.ylabel('$x_2$', fontsize = 15)
plt.legend(loc = 1, fontsize = 12)
plt.show()
Weights and Bias
n_input = 2    # bias is now a separate variable, so inputs are just [x1, x2]
n_output = 1
# drop the constant-1 column; NOTE: destructive — re-running this cell slices again
train_X = train_X[:,1:3]
# define network
def build_model(x, weights, biases):
    """Affine model: return the logits x @ W + b."""
    return tf.matmul(x, weights['output']) + biases['output']
# weights and biases as separate variables (bias no longer folded into the inputs)
weights = {
    'output' : tf.Variable(tf.random_normal([n_input, n_output], stddev = 0.1))
}
biases = {
    'output' : tf.Variable(tf.random_normal([n_output], stddev = 0.1))
}
x = tf.placeholder(tf.float32, [None, n_input])
y = tf.placeholder(tf.float32, [None, n_output])
pred = build_model(x, weights, biases)
loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=pred, labels=y)
loss = tf.reduce_mean(loss)
LR = 0.05
optm = tf.train.GradientDescentOptimizer(LR).minimize(loss)
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
n_batch = 50     # unused: the loop below trains on the full set
n_iter = 15000
n_prt = 250      # record loss every n_prt steps
loss_record = []
for epoch in range(n_iter):
    sess.run(optm, feed_dict = {x: train_X, y: train_y})
    if epoch % n_prt == 0:
        loss_record.append(sess.run(loss, feed_dict = {x: train_X, y: train_y}))
w_hat = sess.run(weights['output'])
b_hat = sess.run(biases['output'])
# loss curve
plt.figure(figsize=(10,8))
plt.plot(np.arange(len(loss_record))*n_prt, loss_record)
plt.xlabel('iteration', fontsize = 15)
plt.ylabel('loss', fontsize = 15)
plt.show()
# decision boundary: w1*x1 + w2*x2 + b = 0  =>  x2 = -(w1/w2)*x1 - b/w2
x1p = np.arange(0, 8, 0.01).reshape(-1, 1)
x2p = - w_hat[0,0]/w_hat[1,0]*x1p - b_hat[0]/w_hat[1,0]
plt.figure(figsize=(10, 8))
plt.plot(x1[C1], x2[C1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(x1[C0], x2[C0], 'bo', alpha = 0.4, label = 'C0')
plt.plot(x1p, x2p, 'g', linewidth = 3, label = '')
plt.xlim([0, 8])
plt.xlabel('$x_1$', fontsize = 15)
plt.ylabel('$x_2$', fontsize = 15)
plt.legend(loc = 1, fontsize = 12)
plt.show()
One-hot Encoding
$$y^{(i)} \in \{1,0\} \quad \implies \quad y^{(i)} \in \{[0,1],[1,0]\}$$
tf.nn.sigmoid_cross_entropy_with_logits $\rightarrow$ tf.nn.softmax_cross_entropy_with_logits
from sklearn.preprocessing import OneHotEncoder
# one-hot encode the 0/1 labels: 0 -> [1, 0], 1 -> [0, 1]
ohe = OneHotEncoder(handle_unknown='ignore')
train_y = ohe.fit_transform(train_y).toarray()
print(train_y)
n_input = 2
n_output = 2   # two output units, one per class, scored with softmax
weights = {
    'output' : tf.Variable(tf.random_normal([n_input, n_output], stddev = 0.1))
}
biases = {
    'output' : tf.Variable(tf.random_normal([n_output], stddev = 0.1))
}
x = tf.placeholder(tf.float32, [None, n_input])
y = tf.placeholder(tf.float32, [None, n_output])
pred = build_model(x, weights, biases)
# softmax (not sigmoid) cross-entropy now that labels are one-hot over 2 classes
loss = tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y)
loss = tf.reduce_mean(loss)
LR = 0.05
optm = tf.train.GradientDescentOptimizer(LR).minimize(loss)
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
n_batch = 50     # unused: full-batch training below
n_iter = 15000
n_prt = 250
loss_record = []
for epoch in range(n_iter):
    sess.run(optm, feed_dict = {x: train_X, y: train_y})
    if epoch % n_prt == 0:
        loss_record.append(sess.run(loss, feed_dict = {x: train_X, y: train_y}))
w_hat = sess.run(weights['output'])
b_hat = sess.run(biases['output'])
plt.figure(figsize=(10,8))
plt.plot(np.arange(len(loss_record))*n_prt, loss_record)
plt.xlabel('iteration', fontsize = 15)
plt.ylabel('loss', fontsize = 15)
plt.show()
print(w_hat)
# one zero-logit line per output unit (columns of w_hat)
x1p = np.arange(0, 8, 0.01).reshape(-1, 1)
x2p = - w_hat[0,0]/w_hat[1,0]*x1p - b_hat[0]/w_hat[1,0]
x3p = - w_hat[0,1]/w_hat[1,1]*x1p - b_hat[1]/w_hat[1,1]
plt.figure(figsize=(10, 8))
plt.plot(x1[C1], x2[C1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(x1[C0], x2[C0], 'bo', alpha = 0.4, label = 'C0')
plt.plot(x1p, x2p, 'k', linewidth = 3, label = '')
plt.plot(x1p, x3p, 'g', linewidth = 3, label = '')
plt.xlim([0, 8])
plt.xlabel('$x_1$', fontsize = 15)
plt.ylabel('$x_2$', fontsize = 15)
plt.legend(loc = 1, fontsize = 12)
plt.show()
# training data generation: boundary is a parabola, so NOT linearly separable
m = 1000
x1 = 10*np.random.rand(m, 1) - 5   # x1 ~ U[-5, 5)
x2 = 8*np.random.rand(m, 1) - 4    # x2 ~ U[-4, 4)
g = - 0.5*(x1-1)**2 + 2*x2 + 5     # quadratic decision function
C1 = np.where(g >= 0)[0]
C0 = np.where(g < 0)[0]
N = C1.shape[0]
M = C0.shape[0]
m = N + M
# no constant-1 column this time: biases are explicit network variables
X1 = np.hstack([x1[C1], x2[C1]])
X0 = np.hstack([x1[C0], x2[C0]])
train_X = np.vstack([X1, X0])
train_X = np.asmatrix(train_X)
train_y = np.vstack([np.ones([N,1]), np.zeros([M,1])])
ohe = OneHotEncoder(handle_unknown='ignore')
train_y = ohe.fit_transform(train_y).toarray()   # one-hot labels for softmax loss
plt.figure(figsize=(10, 8))
plt.plot(x1[C1], x2[C1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(x1[C0], x2[C0], 'bo', alpha = 0.4, label = 'C0')
plt.legend(loc = 1, fontsize = 15)
plt.xlabel(r'$x_1$', fontsize = 15)
plt.ylabel(r'$x_2$', fontsize = 15)
plt.xlim([-5, 5])
plt.ylim([-4, 4])
plt.show()
# 2-2-2 network: one hidden layer with two sigmoid units
n_input = 2
n_hidden = 2
n_output = 2
weights = {
    'hidden' : tf.Variable(tf.random_normal([n_input, n_hidden], stddev = 0.1)),
    'output' : tf.Variable(tf.random_normal([n_hidden, n_output], stddev = 0.1))
}
biases = {
    'hidden' : tf.Variable(tf.random_normal([n_hidden], stddev = 0.1)),
    'output' : tf.Variable(tf.random_normal([n_output], stddev = 0.1))
}
x = tf.placeholder(tf.float32, [None, n_input])
y = tf.placeholder(tf.float32, [None, n_output])
def build_model(x, weights, biases):
    """MLP forward pass: one sigmoid hidden layer, then linear output logits."""
    z = tf.nn.sigmoid(tf.matmul(x, weights['hidden']) + biases['hidden'])
    return tf.matmul(z, weights['output']) + biases['output']
pred = build_model(x, weights, biases)
loss = tf.nn.softmax_cross_entropy_with_logits(logits = pred, labels = y)
loss = tf.reduce_mean(loss)
LR = 0.01
optm = tf.train.GradientDescentOptimizer(LR).minimize(loss)
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
n_batch = 50     # unused: full-batch training below
n_iter = 50000   # more steps than the linear case — the MLP converges slowly here
n_prt = 250
loss_record = []
for epoch in range(n_iter):
    sess.run(optm, feed_dict = {x: train_X, y: train_y})
    if epoch % n_prt == 0:
        loss_record.append(sess.run(loss, feed_dict = {x: train_X, y: train_y}))
w_hat = sess.run(weights)   # fetch both layers' weights as a dict of arrays
b_hat = sess.run(biases)
plt.figure(figsize=(10,8))
plt.plot(np.arange(len(loss_record))*n_prt, loss_record)
plt.xlabel('iteration', fontsize = 15)
plt.ylabel('loss', fontsize = 15)
plt.show()
# hidden-layer activations: the learned 2-D feature space (sigmoid outputs in [0, 1])
H = train_X*w_hat['hidden'] + b_hat['hidden']
H = 1/(1 + np.exp(-H))   # apply sigmoid in NumPy, mirroring the TF hidden layer
plt.figure(figsize=(10, 8))
plt.plot(H[0:N,0], H[0:N,1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(H[N:m,0], H[N:m,1], 'bo', alpha = 0.4, label = 'C0')
plt.xlabel('$z_1$', fontsize = 15)
plt.ylabel('$z_2$', fontsize = 15)
plt.legend(loc = 1, fontsize = 15)
plt.axis('equal')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.show()
# in the hidden space the classes become linearly separable: draw the output-layer lines
x1p = np.arange(0, 1, 0.01).reshape(-1, 1)
x2p = - w_hat['output'][0,0]/w_hat['output'][1,0]*x1p - b_hat['output'][0]/w_hat['output'][1,0]
x3p = - w_hat['output'][0,1]/w_hat['output'][1,1]*x1p - b_hat['output'][1]/w_hat['output'][1,1]
plt.figure(figsize=(10, 8))
plt.plot(H[0:N,0], H[0:N,1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(H[N:m,0], H[N:m,1], 'bo', alpha = 0.4, label = 'C0')
plt.plot(x1p, x2p, 'k', linewidth = 3, label = '')
plt.plot(x1p, x3p, 'g', linewidth = 3, label = '')
plt.xlabel('$z_1$', fontsize = 15)
plt.ylabel('$z_2$', fontsize = 15)
plt.legend(loc = 1, fontsize = 15)
plt.axis('equal')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.show()
# back in the input space: each hidden unit's zero-activation line
x1p = np.arange(-5, 5, 0.01).reshape(-1, 1)
x2p = - w_hat['hidden'][0,0]/w_hat['hidden'][1,0]*x1p - b_hat['hidden'][0]/w_hat['hidden'][1,0]
x3p = - w_hat['hidden'][0,1]/w_hat['hidden'][1,1]*x1p - b_hat['hidden'][1]/w_hat['hidden'][1,1]
plt.figure(figsize=(10, 8))
plt.plot(x1[C1], x2[C1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(x1[C0], x2[C0], 'bo', alpha = 0.4, label = 'C0')
plt.plot(x1p, x2p, 'k', linewidth = 3, label = '')
plt.plot(x1p, x3p, 'g', linewidth = 3, label = '')
plt.xlabel('$x_1$', fontsize = 15)
plt.ylabel('$x_2$', fontsize = 15)
plt.legend(loc = 1, fontsize = 15)
plt.axis('equal')
plt.xlim([-5, 5])
plt.ylim([-4, 4])
plt.show()
# training data generation: an even more complex, non-convex boundary
m = 1000
x1 = 10*np.random.rand(m, 1) - 5
x2 = 8*np.random.rand(m, 1) - 4
g = - 0.5*(x1*x2-1)**2 + 2*x2 + 5   # boundary depends on the x1*x2 interaction
C1 = np.where(g >= 0)[0]
C0 = np.where(g < 0)[0]
N = C1.shape[0]
M = C0.shape[0]
m = N + M
X1 = np.hstack([x1[C1], x2[C1]])
X0 = np.hstack([x1[C0], x2[C0]])
train_X = np.vstack([X1, X0])
train_X = np.asmatrix(train_X)
train_y = np.vstack([np.ones([N,1]), np.zeros([M,1])])
ohe = OneHotEncoder(handle_unknown='ignore')
train_y = ohe.fit_transform(train_y).toarray()
plt.figure(figsize=(10, 8))
plt.plot(x1[C1], x2[C1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(x1[C0], x2[C0], 'bo', alpha = 0.4, label = 'C0')
plt.legend(loc = 1, fontsize = 15)
plt.xlabel(r'$x_1$', fontsize = 15)
plt.ylabel(r'$x_2$', fontsize = 15)
plt.xlim([-5, 5])
plt.ylim([-4, 4])
plt.show()
# wider hidden layer (4 units) to capture the more complex boundary
n_input = 2
n_hidden = 4
n_output = 2
def build_model(x, weights, biases):
    """Forward pass: sigmoid hidden layer followed by an affine output (logits)."""
    hidden_act = tf.nn.sigmoid(tf.matmul(x, weights['hidden']) + biases['hidden'])
    logits = tf.matmul(hidden_act, weights['output']) + biases['output']
    return logits
# variables for the 2-4-2 network
weights = {
    'hidden' : tf.Variable(tf.random_normal([n_input, n_hidden], stddev = 0.1)),
    'output' : tf.Variable(tf.random_normal([n_hidden, n_output], stddev = 0.1))
}
biases = {
    'hidden' : tf.Variable(tf.random_normal([n_hidden], stddev = 0.1)),
    'output' : tf.Variable(tf.random_normal([n_output], stddev = 0.1))
}
x = tf.placeholder(tf.float32, [None, n_input])
y = tf.placeholder(tf.float32, [None, n_output])
pred = build_model(x, weights, biases)
loss = tf.nn.softmax_cross_entropy_with_logits(logits = pred, labels = y)
loss = tf.reduce_mean(loss)
LR = 0.01
optm = tf.train.GradientDescentOptimizer(LR).minimize(loss)
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
n_batch = 50    # unused: full-batch training below
n_iter = 80000  # long schedule for the harder boundary
n_prt = 250
# Training cycle
loss_record = []
for epoch in range(n_iter):
    sess.run(optm, feed_dict = {x: train_X, y: train_y})
    if epoch % n_prt == 0:
        loss_record.append(sess.run(loss, feed_dict = {x: train_X, y: train_y}))
w_hat = sess.run(weights)
b_hat = sess.run(biases)
# plots: training loss, then the four hidden-unit lines over the data
plt.figure(figsize=(10,8))
plt.plot(np.arange(len(loss_record))*n_prt, loss_record)
plt.xlabel('iteration', fontsize = 15)
plt.ylabel('loss', fontsize = 15)
plt.show()
# each hidden unit defines one zero-activation line in the input space:
# x2 = -(w1k/w2k)*x1 - bk/w2k for column k of the hidden weight matrix
x1p = np.arange(-5, 5, 0.01).reshape(-1, 1)
x2p = - w_hat['hidden'][0,0]/w_hat['hidden'][1,0]*x1p - b_hat['hidden'][0]/w_hat['hidden'][1,0]
x3p = - w_hat['hidden'][0,1]/w_hat['hidden'][1,1]*x1p - b_hat['hidden'][1]/w_hat['hidden'][1,1]
x4p = - w_hat['hidden'][0,2]/w_hat['hidden'][1,2]*x1p - b_hat['hidden'][2]/w_hat['hidden'][1,2]
x5p = - w_hat['hidden'][0,3]/w_hat['hidden'][1,3]*x1p - b_hat['hidden'][3]/w_hat['hidden'][1,3]
plt.figure(figsize=(10, 8))
plt.plot(x1[C1], x2[C1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(x1[C0], x2[C0], 'bo', alpha = 0.4, label = 'C0')
plt.plot(x1p, x2p, 'k', linewidth = 3, label = '')
plt.plot(x1p, x3p, 'g', linewidth = 3, label = '')
plt.plot(x1p, x4p, 'm', linewidth = 3, label = '')
plt.plot(x1p, x5p, 'c', linewidth = 3, label = '')
plt.xlabel('$x_1$', fontsize = 15)   # duplicate xlabel call removed
plt.ylabel('$x_2$', fontsize = 15)
plt.legend(loc = 1, fontsize = 15)
plt.axis('equal')
plt.xlim([-5, 5])
plt.ylim([-4, 4])
plt.show()
One of the central ideas of computer science
Depends on solutions to smaller instances of the same problem ( = subproblem)
A function calls itself (self-reference — something a physical machine cannot do in the real world, but which is natural in code)
$$n ! = n \cdot (n-1) \cdots 2 \cdot 1$$
# iterative factorial: accumulate n! in m
n = 5
m = 1
for k in range(1, n + 1):
    m *= k
print(m)
def fac(n):
    """Return n! computed recursively.

    Handles n == 0 (0! == 1) and rejects negative input, which in the
    original version recursed forever (the base case was only n == 1).
    """
    if n < 0:
        raise ValueError("factorial is undefined for negative n")
    if n <= 1:   # base case covers both 0 and 1
        return 1
    return n*fac(n-1)
# recursive
fac(5)
Dynamic Programming: general, powerful algorithm design technique
Fibonacci numbers:
# naive Fibonacci: doubly-recursive definition, exponential running time
def fib(n):
    """Return the n-th Fibonacci number (fib(1) == fib(2) == 1)."""
    return 1 if n <= 2 else fib(n - 1) + fib(n - 2)

fib(10)
# Memoized (top-down dynamic programming) Fibonacci.
# Results are cached in the module-level array `memo`, where memo[k-1]
# holds fib(k); `memo` must be zero-initialized with length >= n first.
def mfib(n):
    global memo
    idx = n - 1
    if memo[idx] != 0:     # already computed: return the cached value
        return memo[idx]
    if n <= 2:             # base cases: fib(1) == fib(2) == 1
        memo[idx] = 1
    else:
        memo[idx] = mfib(n-1) + mfib(n-2)
    return memo[idx]
import numpy as np
n = 10
# cache for mfib: memo[k-1] stores fib(k); 0 means "not computed yet"
memo = np.zeros(n)
mfib(n)
n = 30
# time the naive version (exponential) against the memoized one
%timeit fib(30)
# reset the cache to size n before timing mfib
# NOTE(review): the cache persists across %timeit runs, so only the first
# call pays the fill cost — the comparison still favors memoization
memo = np.zeros(n)
%timeit mfib(30)
$=$ Learning or estimating weights and biases of multi-layer perceptron from training data
3 key components
In mathematical expression
$$\begin{align*}
\min_{\omega} \quad &f(\omega)
\end{align*}
$$
$$ \min_{\omega} \sum_{i=1}^{m}\ell\left( h_{\omega}\left(x^{(i)}\right),y^{(i)}\right)$$
Learning weights and biases from data using gradient descent
Backpropagation
Chain Rule
Computing the derivative of the composition of functions
$\space f(g(x))' = f'(g(x))g'(x)$
$\space {dz \over dx} = {dz \over dy} \bullet {dy \over dx}$
$\space {dz \over dw} = ({dz \over dy} \bullet {dy \over dx}) \bullet {dx \over dw}$
$\space {dz \over du} = ({dz \over dy} \bullet {dy \over dx} \bullet {dx \over dw}) \bullet {dw \over du}$
Backpropagation
Update weights recursively with memory
Optimization procedure
Summary
From Wikipedia
More here
We will be using MNIST to create a multinomial classifier that can detect if the MNIST image shown is a member of class 0, 1, 2, 3, 4, 5, 6, 7, 8 or 9. Succinctly, we're teaching a computer to recognize handwritten digits.
# import os
# os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"]="0"
# Import Library
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
%matplotlib inline
Let's download and load the dataset.
# load MNIST via the TF1 tutorial helper; labels come back one-hot encoded
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
print ("The training data set is:\n")
print (mnist.train.images.shape)   # (num_train, 784): flattened 28x28 images
print (mnist.train.labels.shape)   # (num_train, 10): one-hot labels
print ("The test data set is:")
print (mnist.test.images.shape)
print (mnist.test.labels.shape)
Display a few random samples from it:
# a single training example: a flat 1-D array of pixel intensities
mnist.train.images[5]
# well, that's not a picture (or image), it's an array.
mnist.train.images[5].shape
You might think the training set is made up of 28 $\times$28 grayscale images of handwritten digits. No !!!
The thing is, the image has been flattened. These are 28x28 images that have been flattened into a 1D array. Let's reshape one.
# two equivalent ways to un-flatten the 784-vector back into a 28x28 matrix
img = np.reshape(mnist.train.images[5], [28,28])
# same result via the method form; this overwrites the assignment above
img = mnist.train.images[5].reshape([28,28])
# So now we have a 28x28 matrix, where each element is an intensity level from 0 to 1.
img.shape
Let's visualize what some of these images and their corresponding training labels look like.
# display the image and decode its label
plt.figure(figsize = (6,6))
plt.imshow(img, 'gray')
plt.xticks([])   # hide the tick marks — pixel indices aren't meaningful here
plt.yticks([])
plt.show()
mnist.train.labels[5]             # the one-hot label vector
np.argmax(mnist.train.labels[5])  # its argmax is the digit itself
Batch maker embedded
# the dataset object ships with its own mini-batch iterator
x, y = mnist.train.next_batch(3)
print(x.shape)   # (batch, 784)
print(y.shape)   # (batch, 10)
# Import Library
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
# draw a single example and display it with its decoded label
train_x, train_y = mnist.train.next_batch(1)
img = train_x[0,:].reshape(28,28)
plt.figure(figsize=(6,6))
plt.imshow(img,'gray')
plt.title("Label : {}".format(np.argmax(train_y[0,:])))
plt.xticks([])
plt.yticks([])
plt.show()
One hot encoding
print ('Train labels : {}'.format(train_y[0, :]))   # one-hot label vector
# network dimensions: 784 -> 100 -> 10
n_input = 28*28    # flattened image size
n_hidden = 100     # hidden units
n_output = 10      # digit classes 0-9
# weights/biases as dictionaries, one entry per layer
weights = {
    'hidden' : tf.Variable(tf.random_normal([n_input, n_hidden], stddev = 0.1)),
    'output' : tf.Variable(tf.random_normal([n_hidden, n_output], stddev = 0.1))
}
biases = {
    'hidden' : tf.Variable(tf.random_normal([n_hidden], stddev = 0.1)),
    'output' : tf.Variable(tf.random_normal([n_output], stddev = 0.1))
}
x = tf.placeholder(tf.float32, [None, n_input])
y = tf.placeholder(tf.float32, [None, n_output])
First, the layer performs several matrix multiplication to produce a set of linear activations
Second, each linear activation is running through a nonlinear activation function
Third, predict values with an affine transformation
# Define Network
def build_model(x, weights, biases):
    """Two-layer MLP: ReLU hidden layer, then an affine output producing logits."""
    # hidden layer: affine transform followed by the ReLU nonlinearity
    h = tf.nn.relu(tf.matmul(x, weights['hidden']) + biases['hidden'])
    # output layer: affine transform only — softmax is applied inside the loss
    return tf.matmul(h, weights['output']) + biases['output']
Loss
Optimizer
# Define Loss: mean softmax cross-entropy over the 10 digit classes
pred = build_model(x, weights, biases)
loss = tf.nn.softmax_cross_entropy_with_logits(logits = pred, labels = y)
loss = tf.reduce_mean(loss)
LR = 0.0001   # small learning rate, paired with Adam instead of plain SGD
optm = tf.train.AdamOptimizer(LR).minimize(loss)
n_batch: batch size for mini-batch gradient descent
n_iter: the number of iteration steps
n_prt: check loss for every n_prt iterations
Initializer
n_batch = 50    # Batch Size (mini-batch gradient descent this time)
n_iter = 5000   # Learning Iteration
n_prt = 250     # Print Cycle
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
# track both training and test loss to watch for overfitting
loss_record_train = []
loss_record_test = []
for epoch in range(n_iter):
    train_x, train_y = mnist.train.next_batch(n_batch)
    sess.run(optm, feed_dict = {x: train_x, y: train_y})
    if epoch % n_prt == 0:
        test_x, test_y = mnist.test.next_batch(n_batch)
        c1 = sess.run(loss, feed_dict = {x: train_x, y: train_y})
        c2 = sess.run(loss, feed_dict = {x: test_x, y: test_y})
        loss_record_train.append(c1)
        loss_record_test.append(c2)
        print ("Iter : {}".format(epoch))
        print ("Cost : {}".format(c1))
# training vs. testing loss curves
plt.figure(figsize=(10,8))
plt.plot(np.arange(len(loss_record_train))*n_prt,
         loss_record_train, label = 'training')
plt.plot(np.arange(len(loss_record_test))*n_prt,
         loss_record_test, label = 'testing')
plt.xlabel('iteration', fontsize = 15)
plt.ylabel('loss', fontsize = 15)
plt.legend(fontsize = 12)
plt.ylim([0, np.max(loss_record_train)])
plt.show()
# accuracy on a batch of 100 test images
test_x, test_y = mnist.test.next_batch(100)
my_pred = sess.run(pred, feed_dict = {x : test_x})
my_pred = np.argmax(my_pred, axis = 1)   # predicted class = argmax over the 10 logits
labels = np.argmax(test_y, axis = 1)     # true class decoded from the one-hot labels
accr = np.mean(np.equal(my_pred, labels))
print("Accuracy : {}%".format(accr*100))
# single-sample prediction with class probabilities
test_x, test_y = mnist.test.next_batch(1)
# NOTE: despite the variable name, these are softmax probabilities, not raw logits
logits = sess.run(tf.nn.softmax(pred), feed_dict = {x : test_x})
predict = np.argmax(logits)
plt.figure(figsize = (6,6))
plt.imshow(test_x.reshape(28,28), 'gray')
plt.xticks([])
plt.yticks([])
plt.show()
print('Prediction : {}'.format(predict))
np.set_printoptions(precision = 2, suppress = True)
print('Probability : {}'.format(logits.ravel()))
You may observe that the accuracy on the test dataset is a little lower than the accuracy on the training dataset. This gap between training accuracy and test accuracy is an example of overfitting, when a machine learning model performs worse on new data than on its training data.
What is the highest accuracy you can achieve with this first fully connected model? Since the handwritten digit classification task is pretty straightforward, you may be wondering how we can do better...
$\Rightarrow$ As we saw in lecture, convolutional neural networks (CNNs) are particularly well-suited for a variety of tasks in computer vision, and have achieved near-perfect accuracies on the MNIST dataset. We will build a CNN and ultimately output a probability distribution over the 10 digit classes (0-9) in the next lectures.
%%javascript
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')